***********************************************************************************
* Linking Occuptional Characteristics 
* last modified 1/17/2023
*
* Note: Clean OEWS
*
***********************************************************************************


clear
capture log close 
set more off

***************
* Directories *
***************

cd "/Users/"

global dofile			=	"./Code/NEW-OEWS"
global matrix_data		=   "./Data/National Employment Matrix"
global oews_data		= 	"./Data/OEWS/all_excel"
global onet_data		= 	"./OccLink/ONET/ONET data/merged"
global results 			=	"./Results"
global temp				= 	"./Temp"


set scheme plotplainblind, permanently



*******************************************************************************
** Clean OEWS data
*******************************************************************************

** keep variables 
foreach yr of num 1998/2021  {
	
	import excel "${oews_data}/national_M`yr'_dl", sheet("national_dl") firstrow clear case(lower) allstring
	capture drop year
	gen  soc   = occ_code
	gen  year	   = `yr'
	
	keep soc occ_code h_median a_median h_mean a_mean h_pct10 h_pct25 h_pct75 ///
		 h_pct90 a_pct10 a_pct25 a_pct75 a_pct90 tot_emp year

	save "${temp}/oews`yr'", replace
}

** combine data from different years 
use "${temp}/oews1998", clear 
append using "${temp}/oews1999" "${temp}/oews2000" ///
			 "${temp}/oews2001" "${temp}/oews2002" "${temp}/oews2003" "${temp}/oews2004" ///
			 "${temp}/oews2005" "${temp}/oews2006" "${temp}/oews2007" "${temp}/oews2008" ///
			 "${temp}/oews2009" "${temp}/oews2010" "${temp}/oews2011" "${temp}/oews2012" ///
			 "${temp}/oews2013" "${temp}/oews2014" "${temp}/oews2015" "${temp}/oews2016" /// 	  
			 "${temp}/oews2017" "${temp}/oews2018" "${temp}/oews2019" "${temp}/oews2020" ///
			 "${temp}/oews2021"

** replace missing values 
list tot_emp if missing(real(tot_emp))
for var tot_emp: replace X = "" if X == "**"

list a_median if missing(real( a_median ))
for var h_median  a_median h_mean a_mean h_pct10 h_pct25 h_pct75 h_pct90 a_pct10 a_pct25 a_pct75 a_pct90: replace X = "" if X == "*" | X == "***" 


** replace top coding values 
for var h_median  a_median  a_mean a_pct10 a_pct25 a_pct75 a_pct90: replace X="145600" if X=="#" & year <2008
for var h_median  h_median  h_mean h_pct10 h_pct25 h_pct75 h_pct90: replace X="70" if X=="#" & year <2008

for var h_median  a_median  a_mean a_pct10 a_pct25 a_pct75 a_pct90: replace X="166400" if X=="#" & inrange(year, 2008, 2010)
for var h_median  h_median  h_mean h_pct10 h_pct25 h_pct75 h_pct90: replace X="80" if X=="#" & inrange(year, 2008, 2010)

for var h_median  a_median  a_mean a_pct10 a_pct25 a_pct75 a_pct90: replace X="187200" if X=="#" & inrange(year, 2011, 2015)
for var h_median  h_median  h_mean h_pct10 h_pct25 h_pct75 h_pct90: replace X="90" if X=="#" & inrange(year, 2011, 2015)

for var h_median  a_median  a_mean a_pct10 a_pct25 a_pct75 a_pct90: replace X="208000" if X=="#" & year >=2016
for var h_median  h_median  h_mean h_pct10 h_pct25 h_pct75 h_pct90: replace X="100" if X=="#" & year >= 2016


** generate numeric values 
destring tot_emp h_median a_median h_mean a_mean h_pct10 h_pct25 h_pct75 h_pct90 a_pct10 a_pct25 a_pct75 a_pct90, replace


** remove duplicates (not sure why they exist)
duplicates drop soc year, force

save "${temp}/oews98-21combined", replace
